library(tidyverse)
library(tidyr)
library(rvest)
library(httr)
library(usmap)
library(plotly)
library(sf)
Goal: Web-scrape data from U.S. News about U.S. state rankings and scores in several categories. Product: Create an interactive plot where the user can hover over any state and see its data. Data: State rankings. Interpretation: Washington state is clearly the best state, but most others vary greatly in what they excel at and what they need to work on.
First, check to see if we are allowed to pull data from that website.
# Ask the site's robots.txt whether automated access is permitted.
# NOTE(review): this checks the site root, while the page read below lives
# under /news/best-states/rankings -- consider checking that path instead.
library("robotstxt")
paths_allowed(paths = "http://www.usnews.com")
##
## www.usnews.com
## [1] TRUE
Read the url.
# Address of the U.S. News "Best States" rankings page.
url <- "https://www.usnews.com/news/best-states/rankings"
# Reading the page is left disabled:
# page <- read_html(url)
# page
Pull the data I want. This did not work: the website is not set up well for the SelectorGadget. When I tried to select the data I wanted, the whole page would highlight instead, and I couldn’t grab just the important data. Instead, it was simpler to copy and paste the table into a CSV file and import that into R.
#page %>%
#html_nodes("._h3_cuogz_1")
The data set has the overall rank of each state and its rank on 8 variables: health care, education, economy, infrastructure, opportunity, fiscal stability, crime and corrections, and natural environment. U.S. News gave more weight to health care and education than to the other variables when calculating the overall rankings, because people consider those more important. If we instead treat each variable as equal and total the ranks across all 8 variables, then the state with the lowest total would be the objectively best state.
# Load the hand-assembled rankings table; column-type messages are silenced.
ranks <- readr::read_csv(
  file = "state_rankings.csv",
  show_col_types = FALSE
)
Let’s total the scores in the 8 variables and see how the states stack up across all variables.
# Add `Total`: the row-wise sum of the rank columns from `health_care`
# through `natural_environment`, ignoring missing values.
ranks <- mutate(
  ranks,
  Total = rowSums(
    select(ranks, health_care:natural_environment),
    na.rm = TRUE
  )
)
At first glance, these totals are roughly in line with the overall rankings, with higher totals corresponding to numerically higher (worse) overall ranks, so it doesn’t seem useful to review the overall rankings.
# Interactive choropleth of each state's overall rank.
# `state_ranks` keeps only the `state` and `rank` columns of `ranks`.
state_ranks <- ranks[, c("state", "rank")]
p <- plot_usmap(data = state_ranks, values = "rank") +
  scale_fill_continuous(
    low = "white",
    high = "hotpink",
    # This map shows the overall rank, so the legend is labelled accordingly
    # (it previously read "Education Rank").
    name = "Overall Rank"
  ) +
  labs(
    title = "Overall State Rankings",
    caption = "Lighter colors are for higher ranks."
  )
# Convert the ggplot to a plotly widget so values appear on hover.
ggplotly(p)
# Build one interactive choropleth for a single ranking column of `ranks`.
#
# column: name of the column in `ranks` to map (character scalar).
# colour: colour used for the state outlines and for the high end of the
#         fill gradient (the low end is always white).
# label:  human-readable measure name; the legend is "<label> Rank" and the
#         plot title is "<label> Rankings".
#
# Returns the plotly object produced by ggplotly().
rank_map <- function(column, colour, label) {
  rank_plot <- plot_usmap(data = ranks, values = column, color = colour) +
    scale_fill_continuous(
      low = "white",
      high = colour,
      name = paste(label, "Rank")
    ) +
    labs(title = paste(label, "Rankings"))
  ggplotly(rank_plot)
}

# One map per measure. The single-letter names are kept because the subplot
# call further down refers to them. `c` is only a variable here; calls such
# as c(...) still resolve to the base function.
a <- rank_map("health_care", "red", "Health Care")
b <- rank_map("education", "orange", "Education")
c <- rank_map("economy", "yellow", "Economy")
d <- rank_map("infrastructure", "green", "Infrastructure")
e <- rank_map("opportunity", "lightblue", "Opportunity")
f <- rank_map("fiscal_stability", "blue", "Fiscal Stability")
g <- rank_map("crime_corrections", "violet", "Crime and Corrections")
h <- rank_map("natural_environment", "pink", "Natural Environment")
#
# Panel captions for the 4 x 2 subplot grids: one plotly annotation per
# panel, positioned in paper coordinates (left/right column, four rows).
annotations <- local({
  panel_titles <- c(
    "Health Care", "Education",
    "Economy", "Infrastructure",
    "Opportunity", "Fiscal Stability",
    "Crime & Corrections", "Natural Environment"
  )
  # Left and right columns alternate; each row height is used twice.
  x_pos <- rep(c(0.23, 0.75), times = 4)
  y_pos <- rep(c(1, 0.73, 0.48, 0.22), each = 2)

  lapply(seq_along(panel_titles), function(i) {
    list(
      x = x_pos[i],
      y = y_pos[i],
      text = panel_titles[i],
      xref = "paper",
      yref = "paper",
      xanchor = "center",
      yanchor = "bottom",
      showarrow = FALSE
    )
  })
})
# Arrange the eight measure maps in a four-row grid, then add the overall
# title and the per-panel captions.
layout(
  subplot(a, b, c, d, e, f, g, h, nrows = 4),
  title = 'State Rankings of Measures',
  annotations = annotations
)
Create code that will return the top state in each category.
# State holding rank 1 in each category. The `##` lines are the output
# captured in the source document.
with(ranks, state[health_care == 1])
## [1] "HI"
with(ranks, state[education == 1])
## [1] "NJ"
with(ranks, state[economy == 1])
## [1] "UT"
with(ranks, state[infrastructure == 1])
## [1] "NV"
with(ranks, state[opportunity == 1])
## [1] "IA"
with(ranks, state[fiscal_stability == 1])
## [1] "AK"
with(ranks, state[crime_corrections == 1])
## [1] "NH"
with(ranks, state[natural_environment == 1])
## [1] "HI"
# Draw a labelled map containing only the state(s) ranked first in `column`.
#
# column:  name of the rank column in `ranks` (character scalar).
# outline: colour of the state outline.
# title:   plot title.
top_state_map <- function(column, outline, title) {
  winner <- ranks$state[ranks[[column]] == 1]
  plot_usmap(include = winner, color = outline, labels = TRUE) +
    labs(title = title)
}

# One map per category; the names z..s are the ones passed to subplot() below.
z <- top_state_map("health_care", "red", "First in Health Care")
y <- top_state_map("education", "orange", "First in Education")
x <- top_state_map("economy", "yellow", "First in Economy")
w <- top_state_map("infrastructure", "green", "First in Infrastructure")
v <- top_state_map("opportunity", "lightblue", "First in Opportunity")
u <- top_state_map("fiscal_stability", "blue", "First in Fiscal Stability")
t <- top_state_map("crime_corrections", "violet", "First in Crime & Corrections")
s <- top_state_map("natural_environment", "pink", "First in Natural Environment")

# Combine the eight maps in a four-row grid with the shared panel captions.
layout(
  subplot(z, y, x, w, v, u, t, s, nrows = 4),
  title = 'Top States in Each Category',
  annotations = annotations
)